In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from pathlib import Path
import re # For extracting player slug from filename

# Plotting style
try:
    plt.style.use('seaborn-whitegrid') # This worked for you
except OSError:
    print("Style 'seaborn-whitegrid' not found, trying 'ggplot'.")
    plt.style.use('ggplot')

sns.set_context("talk")

# --- Configuration ---
SEASONS_STR = "2022_2024" # Used to find files, matches your example output
BASE_PROCESSED_DATA_DIR = Path("../data/new/processed_data/")

# --- Discover Player Files ---
processed_files_pattern = f"*_stats_sentiment_{SEASONS_STR}.csv"
player_files_info = []

for f_path in BASE_PROCESSED_DATA_DIR.glob(processed_files_pattern):
    filename = f_path.name
    # Extract player slug: assumes format like "player_slug_stats_sentiment_2022_2024.csv"
    match = re.match(rf"(.+)_stats_sentiment_{SEASONS_STR}\.csv", filename)
    if match:
        player_slug = match.group(1)
        player_name_display = player_slug.replace('_', ' ').title()
        player_files_info.append({
            "slug": player_slug,
            "display_name": player_name_display,
            "file_path": f_path
        })
    else:
        # Check for the '_plus_' variant if the primary pattern fails
        match_plus = re.match(rf"(.+)_stats_plus_sentiment_{SEASONS_STR}\.csv", filename)
        if match_plus:
            player_slug = match_plus.group(1)
            player_name_display = player_slug.replace('_', ' ').title()
            player_files_info.append({
                "slug": player_slug,
                "display_name": player_name_display,
                "file_path": f_path
            })
        # Check for the '_fullsnap_' variant
        match_fullsnap = re.match(rf"(.+)_stats_plus_sentiment_fullsnap_{SEASONS_STR}\.csv", filename)
        if match_fullsnap:
            player_slug = match_fullsnap.group(1)
            player_name_display = player_slug.replace('_', ' ').title()
            player_files_info.append({
                "slug": player_slug,
                "display_name": player_name_display,
                "file_path": f_path
            })


if not player_files_info:
    print(f"❌ No processed player files found in {BASE_PROCESSED_DATA_DIR} matching the pattern '{processed_files_pattern}' or similar variations.")
    print("Please ensure 'process_all_players.py' has run and generated files like 'playername_stats_sentiment_2022_2024.csv'")
else:
    print(f"Found {len(player_files_info)} players to process:")
    for p_info in player_files_info:
        print(f"  - {p_info['display_name']} (File: {p_info['file_path'].name})")
Found 8 players to process:
  - Anthony Edwards (File: anthony_edwards_stats_sentiment_2022_2024.csv)
  - Donovan Mitchell (File: donovan_mitchell_stats_sentiment_2022_2024.csv)
  - Giannis Antetokounmpo (File: giannis_antetokounmpo_stats_sentiment_2022_2024.csv)
  - Jalen Brunson (File: jalen_brunson_stats_sentiment_2022_2024.csv)
  - Lebron James (File: lebron_james_stats_sentiment_2022_2024.csv)
  - Luka Doncic (File: luka_doncic_stats_sentiment_2022_2024.csv)
  - Shai Gilgeous-Alexander (File: shai_gilgeous-alexander_stats_sentiment_2022_2024.csv)
  - Stephen Curry (File: stephen_curry_stats_sentiment_2022_2024.csv)
In [6]:
# High-level Goal: Unsupervised exploration of NBA player performance and associated Reddit sentiment.
# This notebook will iterate through available processed player data, performing
# data loading, preprocessing, visualization, and correlation analysis.
print("Project Goal: Automated analysis for multiple players.")
Project Goal: Automated analysis for multiple players.
In [7]:
def load_and_preprocess_data(file_path, player_name_display):
    """Loads and preprocesses the combined stats and sentiment data for a player."""
    if not file_path.exists():
        print(f"❌ ERROR: Data file not found at {file_path} for {player_name_display}")
        return None

    print(f"\n--- Loading data for {player_name_display} from {file_path.name} ---")
    df = pd.read_csv(file_path)
    print(f"Initial rows loaded: {len(df)}")

    if 'game_date' not in df.columns:
        # Fallback if 'game_date' was not correctly named/saved by previous script, try 'GAME_DATE'
        if 'GAME_DATE' in df.columns:
            print("Using 'GAME_DATE' and converting to datetime.")
            df['game_date'] = pd.to_datetime(df['GAME_DATE'], errors='coerce')
        else:
            print(f"❌ ERROR: Neither 'game_date' nor 'GAME_DATE' column found for {player_name_display}.")
            return None
    else:
        df['game_date'] = pd.to_datetime(df['game_date'], errors='coerce')
    
    df.dropna(subset=['game_date'], inplace=True)

    if 'WL' in df.columns:
        df['WIN'] = df['WL'].apply(lambda x: 1 if x == 'W' else 0 if x == 'L' else np.nan)
        df.dropna(subset=['WIN'], inplace=True)
        df['WIN'] = df['WIN'].astype(int)
    else:
        print(f"⚠️ WL column not found for {player_name_display}. Cannot create WIN feature.")
        df['WIN'] = np.nan # Add WIN column as NaN if WL is missing

    print(f"Rows before filtering on post_count: {len(df)}")
    if 'post_count' in df.columns:
        df_filtered = df[df['post_count'] > 0].copy()
        print(f"Rows after filtering for post_count > 0: {len(df_filtered)}")
        if len(df_filtered) == 0:
            print(f"⚠️ No games found with post_count > 0 for {player_name_display}. Sentiment analysis will be limited.")
            # Return the original df if no posts, so at least stats can be analyzed if desired (optional)
            # For this project, we require sentiment, so we'll return None if no sentiment games
            return None 
        print(f"Filtered data to {len(df_filtered)} games with sentiment (post_count > 0).")
    else:
        print(f"❌ ERROR: 'post_count' column not found for {player_name_display}. Cannot filter for games with sentiment.")
        return None

        # --- Derived sentiment feature -------------------------------
    if {'mean_sentiment', 'post_count'}.issubset(df_filtered.columns):
        df_filtered['sent_intensity'] = (
            df_filtered['mean_sentiment'] * df_filtered['post_count']
        )
    else:
        df_filtered['sent_intensity'] = np.nan
        
    return df_filtered

print("Helper function 'load_and_preprocess_data' defined.")
Helper function 'load_and_preprocess_data' defined.
In [8]:
if not player_files_info:
    print("No player files were found to process. Halting execution.")
else:
    for player_info in player_files_info:
        PLAYER_SLUG = player_info["slug"]
        PLAYER_NAME_DISPLAY = player_info["display_name"]
        input_file_path = player_info["file_path"]

        print(f"\n\n{'='*20} PROCESSING: {PLAYER_NAME_DISPLAY} {'='*20}")
        
        df_player = load_and_preprocess_data(input_file_path, PLAYER_NAME_DISPLAY)

        if df_player is None or df_player.empty:
            print(f"--- Skipping further analysis for {PLAYER_NAME_DISPLAY} due to missing data or no games with sentiment ---")
            continue

        # --- 4.1: Univariate Distributions ---
        print(f"\n--- Univariate Distributions for {PLAYER_NAME_DISPLAY} ---")
        performance_features_to_plot = ['PTS', 'AST', 'REB', 'PLUS_MINUS', 'FG_PCT']
        # Check if 'neg_share' exists, it was added in the process_all_players.py
        sentiment_features_to_plot = ['mean_sentiment', 'pos_share', 
                                      'neg_share' if 'neg_share' in df_player.columns else 'min_sentiment', 
                                      'post_count', 'avg_delta_days']
        
        plt.figure(figsize=(18, 10))
        plot_idx = 1
        for col_list, sup_title_part in [(performance_features_to_plot, "Performance"), (sentiment_features_to_plot, "Sentiment Metrics")]:
            for col in col_list:
                if col in df_player.columns and not df_player[col].isnull().all():
                    plt.subplot(2, max(len(performance_features_to_plot), len(sentiment_features_to_plot)), plot_idx)
                    sns.histplot(df_player[col].dropna(), kde=True, bins=15) # dropna for individual columns
                    plt.title(f'Distribution of {col}')
                    plt.xlabel(col)
                    plt.ylabel('Frequency')
                    plot_idx +=1
                else:
                    print(f"  Skipping histogram for missing/empty column: {col}")
        plt.tight_layout(rect=[0, 0, 1, 0.96]) # Adjust layout to make space for suptitle
        plt.suptitle(f'Univariate Distributions for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=16)
        plt.show()

        # --- 4.2: Correlation Analysis ---
        print(f"\n--- Correlation Analysis for {PLAYER_NAME_DISPLAY} ---")
        numerical_stats_cols = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'PLUS_MINUS', 'WIN', 'MIN']
        numerical_sentiment_cols = ['mean_sentiment', 'min_sentiment', 'max_sentiment', 'pos_share', 
                                    'neg_share' if 'neg_share' in df_player.columns else None, 
                                    'post_count', 'avg_delta_days', 'min_delta_days', 'max_delta_days']
        numerical_sentiment_cols = [col for col in numerical_sentiment_cols if col is not None] # Remove None if neg_share isn't there

        correlation_features = [col for col in numerical_stats_cols if col in df_player.columns and pd.api.types.is_numeric_dtype(df_player[col])] + \
                               [col for col in numerical_sentiment_cols if col in df_player.columns and pd.api.types.is_numeric_dtype(df_player[col])]
        
        df_corr = df_player[list(set(correlation_features))].copy() # Use set to ensure unique cols
        
        for col in df_corr.columns: # Coerce and drop NaNs carefully
            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
        df_corr.dropna(inplace=True)

        if len(df_corr) > 1 and len(df_corr.columns) > 1:
            correlation_matrix = df_corr.corr()
            plt.figure(figsize=(18, 15))
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, annot_kws={"size": 7})
            plt.title(f'Correlation Matrix for {PLAYER_NAME_DISPLAY}', fontsize=16)
            plt.xticks(rotation=45, ha='right', fontsize=8)
            plt.yticks(rotation=0, fontsize=8)
            plt.tight_layout()
            plt.show()
        else:
            print("  ⚠️ Not enough data or columns for correlation matrix.")
#---------------------------------------------------------------------------
        # --- 4.3B: Unsupervised Clustering (PCA + K‑means) ---------
        print(f"\n--- Unsupervised Clustering for {PLAYER_NAME_DISPLAY} ---")

        cluster_features = [
            'PTS', 'AST', 'REB', 'PLUS_MINUS',
            'mean_sentiment', 'sent_intensity'
        ]
        cluster_features = [c for c in cluster_features if c in df_player.columns]

        # 1) scale ---------------------------------------------------
        X = df_player[cluster_features].fillna(0)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # 2) dimensionality reduction --------------------------------
        pca = PCA(n_components=2, random_state=42)
        X_pca = pca.fit_transform(X_scaled)
        print("   • PCA two‑component variance:",
              round(pca.explained_variance_ratio_.sum(), 3))

        # 3) elbow plot (quick) --------------------------------------
        inertias = []
        for k in range(1, 8):
            inertias.append(
                KMeans(n_clusters=k, n_init=10, random_state=42)
                .fit(X_scaled).inertia_
            )
        plt.figure(figsize=(4, 3))
        plt.plot(range(1, 8), inertias, marker='o')
        plt.title('Elbow for K‑means'); plt.xlabel('k'); plt.ylabel('Inertia')
        plt.show()

        # 4) fit K‑means (k=3 by default) ----------------------------
        k = 3
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        df_player['cluster'] = kmeans.fit_predict(X_scaled)

        # 5) PCA scatter ---------------------------------------------
        plt.figure(figsize=(7, 6))
        sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1],
                        hue=df_player['cluster'], palette='Set2', s=80)
        plt.title(f'PCA space – {PLAYER_NAME_DISPLAY}')
        plt.xlabel('PC1'); plt.ylabel('PC2')
        plt.legend(title='Cluster')
        plt.show()

        # 6) cluster profile table -----------------------------------
        profile_cols = cluster_features + (['WIN'] if 'WIN' in df_player.columns else [])
        print("Cluster medians:")
        display(
            df_player.groupby('cluster')[profile_cols]
                     .median()
                     .round(2)
        )












#-------------------------------------------------------------------------------
        # --- 4.3: Bivariate Scatter Plots ---
        print(f"\n--- Bivariate Scatter Plots for {PLAYER_NAME_DISPLAY} ---")
        key_relationships = [
            ('PTS', 'mean_sentiment'), ('PLUS_MINUS', 'mean_sentiment'),
            ('PTS', 'post_count'), ('WIN', 'mean_sentiment'),
            ('FG_PCT', 'mean_sentiment'), ('mean_sentiment', 'avg_delta_days')
        ]
        plt.figure(figsize=(20, 10)) # Adjusted size for 2 rows if needed
        plot_idx = 1
        for x_col, y_col in key_relationships:
            if x_col in df_player.columns and y_col in df_player.columns:
                plt.subplot(2, 3, plot_idx)
                if x_col == 'WIN' and not df_player[x_col].isnull().all():
                    sns.boxplot(x=df_player[x_col].astype(int), y=df_player[y_col].dropna(), palette={0: 'red', 1: 'green'})
                    plt.xticks([0,1], ['Loss', 'Win'])
                elif not df_player[x_col].isnull().all() and not df_player[y_col].isnull().all():
                    sns.scatterplot(data=df_player, x=x_col, y=y_col, hue='WIN' if 'WIN' in df_player.columns else None, 
                                    palette={0: 'red', 1: 'green'} if 'WIN' in df_player.columns else None, alpha=0.7)
                    try: # Add regplot only if there's enough variance
                        sns.regplot(data=df_player, x=x_col, y=y_col, scatter=False, color='blue', line_kws={'linestyle':'--'})
                    except ValueError:
                        print(f"Could not plot regression line for {x_col} vs {y_col} (possibly due to low variance or NaNs).")
                plt.title(f'{x_col} vs. {y_col}')
                plot_idx += 1
            else:
                 print(f"  Skipping bivariate plot: missing {x_col} or {y_col}")
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.suptitle(f'Key Bivariate Relationships for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=18)
        plt.show()

        # --- 4.4: Sentiment by Game Outcome ---
        if 'WIN' in df_player.columns and not df_player['WIN'].isnull().all():
            print(f"\n--- Sentiment Analysis by Game Outcome for {PLAYER_NAME_DISPLAY} ---")
            plt.figure(figsize=(18, 6))
            sentiment_metrics_for_outcome = ['mean_sentiment', 'pos_share', 
                                             'neg_share' if 'neg_share' in df_player.columns else 'min_sentiment', 
                                             'post_count']
            plot_idx = 1
            for metric in sentiment_metrics_for_outcome:
                if metric in df_player.columns and not df_player[metric].isnull().all():
                    plt.subplot(1, len(sentiment_metrics_for_outcome), plot_idx)
                    sns.boxplot(x='WIN', y=metric, data=df_player, palette={0: 'salmon', 1: 'lightgreen'})
                    plt.title(f'{metric} by Outcome')
                    plt.xticks([0, 1], ['Loss', 'Win'])
                    plt.xlabel('Game Outcome')
                    plt.ylabel(metric.replace('_', ' ').title())
                    plot_idx +=1
                else:
                    print(f"  Skipping outcome plot for missing/empty column: {metric}")
            plt.tight_layout(rect=[0, 0, 1, 0.96])
            plt.suptitle(f'Sentiment Metrics by Game Outcome for {PLAYER_NAME_DISPLAY}', y=1.00, fontsize=16)
            plt.show()
        else:
            print(f"  Skipping sentiment by game outcome for {PLAYER_NAME_DISPLAY} (WIN column missing or all NaN).")
            
        # Optional: Add a small delay or clear plots if running for many players to manage memory/display
        # import time
        # time.sleep(1) 
        # from IPython.display import clear_output
        # clear_output(wait=True)
        
        print(f"\n--- Finished processing {PLAYER_NAME_DISPLAY} ---")

print("\n\n🏁 All Players Processed! 🏁")

==================== PROCESSING: Anthony Edwards ====================

--- Loading data for Anthony Edwards from anthony_edwards_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 237
Rows before filtering on post_count: 237
Rows after filtering for post_count > 0: 27
Filtered data to 27 games with sentiment (post_count > 0).

--- Univariate Distributions for Anthony Edwards ---
No description has been provided for this image
--- Correlation Analysis for Anthony Edwards ---
No description has been provided for this image
--- Unsupervised Clustering for Anthony Edwards ---
   • PCA two‑component variance: 0.568
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 27.5 5.0 5.0 0.5 0.67 0.75 1.0
1 15.0 7.0 13.0 -3.5 0.54 1.08 1.0
2 36.0 2.0 6.0 -2.0 0.03 0.04 0.0
--- Bivariate Scatter Plots for Anthony Edwards ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Anthony Edwards ---
No description has been provided for this image
--- Finished processing Anthony Edwards ---


==================== PROCESSING: Donovan Mitchell ====================

--- Loading data for Donovan Mitchell from donovan_mitchell_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 194
Rows before filtering on post_count: 194
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Donovan Mitchell ---
No description has been provided for this image
--- Correlation Analysis for Donovan Mitchell ---
No description has been provided for this image
--- Unsupervised Clustering for Donovan Mitchell ---
   • PCA two‑component variance: 0.56
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 12.5 4.5 3.0 5.5 0.73 0.73 0.5
1 30.0 6.0 5.0 15.5 0.66 0.85 1.0
2 27.0 7.0 6.5 5.0 -0.02 -0.02 0.5
--- Bivariate Scatter Plots for Donovan Mitchell ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Donovan Mitchell ---
No description has been provided for this image
--- Finished processing Donovan Mitchell ---


==================== PROCESSING: Giannis Antetokounmpo ====================

--- Loading data for Giannis Antetokounmpo from giannis_antetokounmpo_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 203
Rows before filtering on post_count: 203
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Giannis Antetokounmpo ---
No description has been provided for this image
--- Correlation Analysis for Giannis Antetokounmpo ---
No description has been provided for this image
--- Unsupervised Clustering for Giannis Antetokounmpo ---
   • PCA two‑component variance: 0.623
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 33.5 4.5 12.0 -6.5 0.66 0.66 0.0
1 26.0 10.0 12.0 22.0 0.66 0.66 1.0
2 27.0 6.0 10.0 1.0 -0.73 -0.73 0.0
--- Bivariate Scatter Plots for Giannis Antetokounmpo ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Giannis Antetokounmpo ---
No description has been provided for this image
--- Finished processing Giannis Antetokounmpo ---


==================== PROCESSING: Jalen Brunson ====================

--- Loading data for Jalen Brunson from jalen_brunson_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 210
Rows before filtering on post_count: 210
Rows after filtering for post_count > 0: 24
Filtered data to 24 games with sentiment (post_count > 0).

--- Univariate Distributions for Jalen Brunson ---
No description has been provided for this image
--- Correlation Analysis for Jalen Brunson ---
No description has been provided for this image
--- Unsupervised Clustering for Jalen Brunson ---
   • PCA two‑component variance: 0.583
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 24.0 7.0 3.0 10.0 0.60 0.95 1.0
1 24.0 5.0 1.5 -15.0 0.66 0.66 0.0
2 37.0 9.0 3.0 -1.0 0.67 0.67 1.0
--- Bivariate Scatter Plots for Jalen Brunson ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Jalen Brunson ---
No description has been provided for this image
--- Finished processing Jalen Brunson ---


==================== PROCESSING: Lebron James ====================

--- Loading data for Lebron James from lebron_james_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 196
Rows before filtering on post_count: 196
Rows after filtering for post_count > 0: 21
Filtered data to 21 games with sentiment (post_count > 0).

--- Univariate Distributions for Lebron James ---
No description has been provided for this image
--- Correlation Analysis for Lebron James ---
No description has been provided for this image
--- Unsupervised Clustering for Lebron James ---
   • PCA two‑component variance: 0.535
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 33.0 8.0 8.0 8.0 0.69 1.48 1.0
1 22.0 9.0 7.0 -1.0 0.20 0.20 1.0
2 25.0 7.0 6.0 -1.0 0.78 0.80 1.0
--- Bivariate Scatter Plots for Lebron James ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Lebron James ---
No description has been provided for this image
--- Finished processing Lebron James ---


==================== PROCESSING: Luka Doncic ====================

--- Loading data for Luka Doncic from luka_doncic_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 186
Rows before filtering on post_count: 186
Rows after filtering for post_count > 0: 15
Filtered data to 15 games with sentiment (post_count > 0).

--- Univariate Distributions for Luka Doncic ---
No description has been provided for this image
--- Correlation Analysis for Luka Doncic ---
No description has been provided for this image
--- Unsupervised Clustering for Luka Doncic ---
   • PCA two‑component variance: 0.635
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 31.5 7.0 8.0 7.0 0.66 0.88 1.0
1 21.0 12.0 11.5 10.5 0.59 1.03 1.0
2 14.0 4.0 5.0 15.0 0.06 0.12 1.0
--- Bivariate Scatter Plots for Luka Doncic ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Luka Doncic ---
No description has been provided for this image
--- Finished processing Luka Doncic ---


==================== PROCESSING: Shai Gilgeous-Alexander ====================

--- Loading data for Shai Gilgeous-Alexander from shai_gilgeous-alexander_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 219
Rows before filtering on post_count: 219
Rows after filtering for post_count > 0: 31
Filtered data to 31 games with sentiment (post_count > 0).

--- Univariate Distributions for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Correlation Analysis for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Unsupervised Clustering for Shai Gilgeous-Alexander ---
   • PCA two‑component variance: 0.545
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 25.0 8.0 3.0 8.0 0.29 0.29 1.0
1 33.0 6.0 5.0 16.0 0.66 0.67 1.0
2 22.0 4.0 5.0 -6.0 0.74 0.82 0.0
--- Bivariate Scatter Plots for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Shai Gilgeous-Alexander ---
No description has been provided for this image
--- Finished processing Shai Gilgeous-Alexander ---


==================== PROCESSING: Stephen Curry ====================

--- Loading data for Stephen Curry from stephen_curry_stats_sentiment_2022_2024.csv ---
Initial rows loaded: 200
Rows before filtering on post_count: 200
Rows after filtering for post_count > 0: 27
Filtered data to 27 games with sentiment (post_count > 0).

--- Univariate Distributions for Stephen Curry ---
No description has been provided for this image
--- Correlation Analysis for Stephen Curry ---
No description has been provided for this image
--- Unsupervised Clustering for Stephen Curry ---
   • PCA two‑component variance: 0.545
C:\Users\realo\AppData\Roaming\Python\Python37\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 8209 (\N{NON-BREAKING HYPHEN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
No description has been provided for this image
Cluster medians:
PTS AST REB PLUS_MINUS mean_sentiment sent_intensity WIN
cluster
0 31.0 8.0 5.0 15.0 0.41 0.52 1.0
1 17.0 5.0 1.0 -2.0 0.71 1.96 1.0
2 25.0 5.0 5.0 6.0 0.73 0.78 1.0
--- Bivariate Scatter Plots for Stephen Curry ---
No description has been provided for this image
--- Sentiment Analysis by Game Outcome for Stephen Curry ---
No description has been provided for this image
--- Finished processing Stephen Curry ---


🏁 All Players Processed! 🏁